from bs4 import BeautifulSoup
import os
import nltk
import fitz
import re 
from nltk.tokenize import word_tokenize
from collections import Counter

# Fetch the NLTK 'punkt' tokenizer data. This runs every time the module is
# imported or executed, before any function below is called.
# NOTE(review): presumably this is the resource word_tokenize relies on;
# newer NLTK releases may look for 'punkt_tab' instead -- confirm against the
# installed NLTK version.
nltk.download('punkt')

def clean(text):
    """Normalize raw extracted text into single-space-separated words.

    Steps, in order:
      1. remove URLs (http/https),
      2. turn every run of whitespace (line feeds, carriage returns, tabs,
         ...) into one space,
      3. drop every remaining character outside printable ASCII,
      4. replace common punctuation with a space,
      5. collapse repeated spaces and trim the ends.

    Args:
        text: Raw text, e.g. as extracted from a PDF or HTML document.

    Returns:
        The cleaned string; words are separated by exactly one space.
    """
    text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text) # URL
    # Line breaks and tabs separate words. They must become a space rather
    # than be deleted, otherwise the last word of one line is glued to the
    # first word of the next and bogus tokens/pairs are produced.
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\x20-\x7E]+', '', text) # non-printable / non-ASCII
    text = re.sub(r'[,.:;!`\'"]', ' ', text) # punctuation
    # split()/join collapses the runs of spaces introduced above and trims
    # leading/trailing whitespace.
    return ' '.join(text.split())

def extract_pdf(pdf_path):
    """Return the word tokens of a PDF file's text.

    The text of every page is concatenated, lower-cased, passed through
    ``clean`` and then split with ``word_tokenize``.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The list of tokens produced by ``word_tokenize``.
    """
    # The context manager closes the document (and its file handle) even if
    # text extraction raises part-way through.
    with fitz.open(pdf_path) as doc:
        # join() avoids repeatedly re-copying the growing string per page.
        text = "".join(page.get_text() for page in doc.pages())
    return word_tokenize(clean(text.lower()))

def extract_html(html_path):
    """Return the word tokens of an HTML file's text.

    The file is read as UTF-8 and parsed with BeautifulSoup's 'html.parser';
    the result of ``get_text()`` is lower-cased, passed through ``clean`` and
    split with ``word_tokenize``.

    Args:
        html_path: Path of the HTML file to read.

    Returns:
        The list of tokens produced by ``word_tokenize``.
    """
    with open(html_path, 'r', encoding='utf-8') as handle:
        raw_text = BeautifulSoup(handle, 'html.parser').get_text()
    lowered = raw_text.lower()
    cleaned = clean(lowered)
    return word_tokenize(cleaned)

def file_type(input_path):
    """Extract the adjacent word pairs of a PDF or HTML file.

    The extractor is chosen from the file extension (case-insensitive):
    ``.pdf`` uses ``extract_pdf`` and ``.html`` uses ``extract_html``.

    Args:
        input_path: Path of the file to process.

    Returns:
        A list of ``(word, next_word)`` tuples, one per pair of consecutive
        tokens. The list is empty when the file has fewer than two tokens or
        when the extension is not supported (a message is printed in the
        latter case).
    """
    lowered = input_path.lower()
    if lowered.endswith('.pdf'):
        words_list = extract_pdf(input_path)
    elif lowered.endswith('.html'):
        words_list = extract_html(input_path)
    else:
        print("Unsupported file format")
        # No words were extracted on this path, so return an empty result
        # instead of referencing a variable that was never assigned.
        return []

    # Pair each token with its successor; zip stops at the shorter sequence,
    # so fewer than two tokens naturally yields no pairs.
    return list(zip(words_list, words_list[1:]))

def process_input(input_dir, output_file):
    """Count adjacent word pairs across a directory and write them to a file.

    Every regular file directly inside ``input_dir`` whose name ends in
    ``.pdf`` or ``.html`` (case-insensitive) is passed to ``file_type``; the
    resulting pairs are tallied and written, most frequent first, one per
    line in the form ``"<word1> <word2> : <count>"``.

    Args:
        input_dir: Directory to scan (not recursive).
        output_file: Path of the UTF-8 text file to create or overwrite.
    """
    all_word_pairs = []
    # os.listdir returns entries in arbitrary order; sorting makes the
    # traversal, and therefore the order of equally frequent pairs in the
    # output, reproducible between runs and platforms.
    for filename in sorted(os.listdir(input_dir)):
        file_path = os.path.join(input_dir, filename)
        if os.path.isfile(file_path) and file_path.lower().endswith(('.pdf', '.html')):
            all_word_pairs.extend(file_type(file_path))

    counts = Counter(all_word_pairs)

    # Use a distinct name for the handle so the path parameter is not shadowed.
    with open(output_file, 'w', encoding='utf-8') as out:
        # most_common() orders by descending count; ties keep the order in
        # which the pairs were first seen.
        out.writelines(
            f"{first} {second} : {count}\n"
            for (first, second), count in counts.most_common()
        )

# Script entry point: runs the word-pair count with hard-coded paths.
if __name__ == "__main__":
    # NOTE(review): process_input lists this path with os.listdir, so it must
    # be a directory; "/usr/share/dict/words" looks like it may be a plain
    # word-list file on many systems -- confirm the intended location.
    input_dir = r"/usr/share/dict/words"
    # The report is written inside the input directory; its .txt extension
    # keeps it out of the .pdf/.html filter applied by process_input.
    output_file = r"/usr/share/dict/words/word_pairs.txt"
    process_input(input_dir, output_file)